Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alef/newmerge #482

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Globalization;
using Elastic.Clients.Elasticsearch;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.AI.OpenAI;
using Microsoft.KernelMemory.DataFormats.Text;
using Microsoft.KernelMemory.MemoryDb.Elasticsearch;
using Microsoft.KernelMemory.MemoryStorage;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Elasticsearch.FunctionalTests.Additional;

/// <summary>
/// Functional tests that store text files through <see cref="IMemoryDb"/> and then either
/// delete the stored records or search them by similarity.
/// </summary>
public class DataStorageTests : MemoryDbFunctionalTest
{
    public DataStorageTests(
        IConfiguration cfg,
        ITestOutputHelper output)
        : base(cfg, output)
    { }

    /// <summary>
    /// Upserts one text file, deletes every record id returned by the upsert, and then
    /// checks that the converted index reports a document count of zero.
    /// </summary>
    [Fact]
    public async Task CanUpsertOneTextDocumentAndDeleteAsync()
    {
        var indexName = nameof(CanUpsertOneTextDocumentAndDeleteAsync);

        // We upsert the file
        var docIds = (await UpsertTextFilesAsync(
                memoryDb: this.MemoryDb,
                textEmbeddingGenerator: this.TextEmbeddingGenerator,
                output: this.Output,
                indexName: indexName,
                fileNames: new[]
                {
                    TestsHelper.WikipediaCarbonFileName
                }).ConfigureAwait(false))
            .ToList();

        // Without at least one stored record the final "count is zero" check would pass vacuously.
        Assert.NotEmpty(docIds);

        // Deletes the document
        foreach (var id in docIds)
        {
            await this.MemoryDb.DeleteAsync(indexName, new MemoryRecord() { Id = id })
                .ConfigureAwait(false);
        }

        // Verifies that the documents are gone
        var actualIndexName = IndexNameHelper.Convert(indexName, base.ElasticsearchConfig);
        var res = await this.Client.CountAsync(r => r.Index(actualIndexName))
            .ConfigureAwait(false);
        Assert.Equal(0, res.Count);
    }

    /// <summary>
    /// Upserts two text files and checks that a similarity search for the word "carbon"
    /// yields at least one result.
    /// </summary>
    [Fact]
    public async Task CanUpsertTwoTextFilesAndGetSimilarListAsync()
    {
        var indexName = nameof(CanUpsertTwoTextFilesAndGetSimilarListAsync);

        await UpsertTextFilesAsync(
            memoryDb: this.MemoryDb,
            textEmbeddingGenerator: this.TextEmbeddingGenerator,
            output: this.Output,
            indexName: indexName,
            fileNames: new[]
            {
                TestsHelper.WikipediaCarbonFileName,
                TestsHelper.WikipediaMoonFilename
            }).ConfigureAwait(false);

        // Gets documents that are similar to the word "carbon".
        var foundSomething = false;

        var textToMatch = "carbon";
        await foreach (var result in this.MemoryDb.GetSimilarListAsync(
                           index: indexName,
                           text: textToMatch,
                           limit: 1))
        {
            this.Output.WriteLine($"Found a document matching '{textToMatch}': {result.Item1.Payload["file"]}.");

            // One match is enough; record it so the assertion below reflects what happened.
            foundSomething = true;
            break;
        }

        Assert.True(foundSomething, "It should have found something...");
    }

    /// <summary>
    /// Creates a new GUID rendered as 32 lowercase hexadecimal digits with no dashes.
    /// </summary>
    /// <returns>The GUID in the "N" format.</returns>
    public static string GuidWithoutDashes() => Guid.NewGuid().ToString("N");

    /// <summary>
    /// Creates the index, splits each file into paragraphs, generates an embedding for every
    /// paragraph and upserts it as a separate <see cref="MemoryRecord"/>.
    /// </summary>
    /// <param name="memoryDb">Memory DB used to create the index and upsert the records.</param>
    /// <param name="textEmbeddingGenerator">Generator used to embed each paragraph.</param>
    /// <param name="output">Test output used for progress messages.</param>
    /// <param name="indexName">Name of the index passed to <paramref name="memoryDb"/>.</param>
    /// <param name="fileNames">Paths of the text files to read and index.</param>
    /// <returns>The values returned by <c>UpsertAsync</c>, one per stored paragraph, in upsert order.</returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public static async Task<IEnumerable<string>> UpsertTextFilesAsync(
        IMemoryDb memoryDb,
        ITextEmbeddingGenerator textEmbeddingGenerator,
        ITestOutputHelper output,
        string indexName,
        IEnumerable<string> fileNames)
    {
        ArgumentNullException.ThrowIfNull(memoryDb);
        ArgumentNullException.ThrowIfNull(textEmbeddingGenerator);
        ArgumentNullException.ThrowIfNull(output);
        ArgumentNullException.ThrowIfNull(indexName);
        ArgumentNullException.ThrowIfNull(fileNames);

        // IMemoryDb does not create the index automatically.
        // NOTE(review): 1536 presumably matches the embedding generator's vector size - confirm.
        await memoryDb.CreateIndexAsync(indexName, 1536)
            .ConfigureAwait(false);

        var results = new List<string>();
        foreach (var fileName in fileNames)
        {
            // Reads the text from the file
            string fullText = await File.ReadAllTextAsync(fileName)
                .ConfigureAwait(false);

            // Splits the text into lines of up to 1000 tokens each
#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
            var lines = TextChunker.SplitPlainTextLines(fullText,
                maxTokensPerLine: 1000,
                tokenCounter: null);

            // Splits the line into paragraphs
            var paragraphs = TextChunker.SplitPlainTextParagraphs(lines,
                maxTokensPerParagraph: 1000,
                overlapTokens: 100);
#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.

            output.WriteLine($"File '{fileName}' contains {paragraphs.Count} paragraphs.");

            // Indexes each paragraph as a separate document
            var paraIdx = 0;
            var documentId = GuidWithoutDashes() + GuidWithoutDashes();
            var fileId = GuidWithoutDashes();

            foreach (var paragraph in paragraphs)
            {
                var embedding = await textEmbeddingGenerator.GenerateEmbeddingAsync(paragraph)
                    .ConfigureAwait(false);

                output.WriteLine($"Indexed paragraph {++paraIdx}/{paragraphs.Count}. {paragraph.Length} characters.");

                var filePartId = GuidWithoutDashes();

                var esId = $"d={documentId}//p={filePartId}";

                var mrec = new MemoryRecord()
                {
                    Id = esId,
                    Payload = new Dictionary<string, object>()
                    {
                        { "file", fileName },
                        { "text", paragraph },
                        { "vector_provider", textEmbeddingGenerator.GetType().Name },
                        { "vector_generator", "TODO" },
                        // Invariant culture keeps the timestamp machine-readable: with the current
                        // culture both the calendar and the ':' separator could vary per machine.
                        { "last_update", DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ss", CultureInfo.InvariantCulture) },
                        { "text_embedding_generator", textEmbeddingGenerator.GetType().Name }
                    },
                    Tags = new TagCollection()
                    {
                        { "__document_id", documentId },
                        { "__file_type", "text/plain" },
                        { "__file_id", fileId },
                        { "__file_part", filePartId }
                    },
                    Vector = embedding
                };

                var res = await memoryDb.UpsertAsync(indexName, mrec)
                    .ConfigureAwait(false);

                results.Add(res);
            }

            output.WriteLine("");
        }

        return results;
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright (c) Microsoft. All rights reserved.

using Elastic.Clients.Elasticsearch;
using Microsoft.KernelMemory.AI.OpenAI;
using Microsoft.KernelMemory.MemoryDb.Elasticsearch;
using Microsoft.KernelMemory.MemoryStorage;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Elasticsearch.FunctionalTests.Additional;

/// <summary>
/// Functional tests for creating, listing and deleting indices through the memory DB.
/// </summary>
public class IndexManagementTests : MemoryDbFunctionalTest
{
    public IndexManagementTests(
        IConfiguration cfg,
        ITestOutputHelper output)
        : base(cfg, output)
    { }

    /// <summary>
    /// Creates an index through the memory DB, confirms through the client that it exists,
    /// deletes it through the memory DB and confirms that it no longer exists.
    /// </summary>
    [Fact]
    public async Task CanCreateAndDeleteIndexAsync()
    {
        var indexName = nameof(CanCreateAndDeleteIndexAsync);
        var vectorSize = 1536;

        // Creates the index using IMemoryDb
        await this.MemoryDb.CreateIndexAsync(indexName, vectorSize)
            .ConfigureAwait(false);

        // Verifies the index is created using the ES client
        var actualIndexName = IndexNameHelper.Convert(indexName, base.ElasticsearchConfig);
        var resp = await this.Client.Indices.ExistsAsync(actualIndexName)
            .ConfigureAwait(false);
        Assert.True(resp.Exists);
        this.Output.WriteLine($"The index '{actualIndexName}' was created successfully.");

        // Deletes the index
        await this.MemoryDb.DeleteIndexAsync(indexName)
            .ConfigureAwait(false);

        // Verifies the index is deleted using the ES client
        resp = await this.Client.Indices.ExistsAsync(actualIndexName)
            .ConfigureAwait(false);
        Assert.False(resp.Exists);
        this.Output.WriteLine($"The index '{actualIndexName}' was deleted successfully.");
    }

    /// <summary>
    /// Creates two indices through the memory DB and checks that both of them appear in the
    /// list returned by <c>GetIndexesAsync</c>.
    /// </summary>
    [Fact]
    public async Task CanGetIndicesAsync()
    {
        var indexNames = new[]
        {
            IndexNameHelper.Convert(nameof(CanGetIndicesAsync) + "-First", base.ElasticsearchConfig),
            IndexNameHelper.Convert(nameof(CanGetIndicesAsync) + "-Second", base.ElasticsearchConfig)
        };

        // Creates the indices using IMemoryDb
        foreach (var indexName in indexNames)
        {
            await this.MemoryDb.CreateIndexAsync(indexName, 1536)
                .ConfigureAwait(false);
        }

        try
        {
            // Verifies the indices are returned
            var indices = await this.MemoryDb.GetIndexesAsync()
                .ConfigureAwait(false);

            // Each created name must be looked up in the returned list. Checking the returned
            // list against itself is always true and verifies nothing.
            // NOTE(review): assumes GetIndexesAsync reports names in the same form that was
            // passed to CreateIndexAsync - confirm against the memory DB implementation.
            foreach (var expected in indexNames)
            {
                Assert.True(
                    indices.Contains(expected),
                    $"The index '{expected}' was not returned by GetIndexesAsync.");
            }
        }
        finally
        {
            // Cleans up, also when the verification above fails
            foreach (var indexName in indexNames)
            {
                await this.MemoryDb.DeleteIndexAsync(indexName)
                    .ConfigureAwait(false);
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.MemoryDb.Elasticsearch;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Elasticsearch.FunctionalTests.Additional;

/// <summary>
/// Tests for the index-name validation and conversion performed by <see cref="IndexNameHelper"/>.
/// </summary>
public class IndexnameTests : BaseFunctionalTestCase
{
    private readonly ITestOutputHelper _output;

    public IndexnameTests(IConfiguration cfg, ITestOutputHelper output)
        : base(cfg, output)
    {
        this._output = output ?? throw new ArgumentNullException(nameof(output));
    }

    /// <summary>
    /// Checks that <c>TryConvert</c> succeeds and reports no errors for each valid name.
    /// </summary>
    /// <param name="indexName">Index name expected to be accepted.</param>
    [Theory]
    [InlineData("")] // default index
    [InlineData("nondefault")]
    [InlineData("WithUppercase")]
    [InlineData("With-Dashes")]
    [InlineData("123numberfirst")]
    public void GoodIndexNamesAreAccepted(string indexName)
    {
        Assert.True(IndexNameHelper.TryConvert(indexName, base.ElasticsearchConfig, out var convResult));
        Assert.Empty(convResult.Errors);

        this._output.WriteLine($"The index name '{indexName}' will be translated to '{convResult.ActualIndexName}'.");
    }

    /// <summary>
    /// Checks that <c>Convert</c> throws <see cref="InvalidIndexNameException"/> for each
    /// invalid name and that the exception carries the expected number of errors.
    /// </summary>
    /// <param name="indexName">Index name expected to be rejected.</param>
    /// <param name="errorCount">Number of errors the exception is expected to report.</param>
    [Theory]
    // An index name cannot start with a hyphen (-) or underscore (_).
    //[InlineData("-test", 1)]
    //[InlineData("test_", 1)]
    // An index name can only contain letters, digits, and hyphens (-).
    [InlineData("test space", 1)]
    [InlineData("test/slash", 1)]
    [InlineData("test\\backslash", 1)]
    [InlineData("test.dot", 1)]
    [InlineData("test:colon", 1)]
    [InlineData("test*asterisk", 1)]
    [InlineData("test<less", 1)]
    [InlineData("test>greater", 1)]
    [InlineData("test|pipe", 1)]
    [InlineData("test?question", 1)]
    [InlineData("test\"quote", 1)]
    [InlineData("test'quote", 1)]
    [InlineData("test`backtick", 1)]
    [InlineData("test~tilde", 1)]
    [InlineData("test!exclamation", 1)]
    // Avoid names that are dot-only or dot and numbers
    // Multi error
    [InlineData(".", 1)]
    [InlineData("..", 1)]
    [InlineData("1.2.3", 1)]
    //[InlineData("_test", 1)]
    public void BadIndexNamesAreRejected(string indexName, int errorCount)
    {
        // Converting an invalid index name must throw
        var exception = Assert.Throws<InvalidIndexNameException>(() =>
        {
            IndexNameHelper.Convert(indexName, base.ElasticsearchConfig);
        });

        // Materialized once because the errors are both printed and counted.
        var errors = exception.Errors.ToList();
        var errorList = string.Join("\n", errors);

        this._output.WriteLine(
            $"The index name '{indexName}' had the following errors:\n{errorList}\n" +
            $"The expected number of errors was {errorCount}.");

        Assert.True(
            errorCount == errors.Count,
            $"The number of errors expected ({errorCount}) is different than the number of errors found ({errors.Count}).");
    }

    /// <summary>
    /// Checks that a 256-character name is rejected with exactly one error.
    /// </summary>
    [Fact]
    public void IndexNameCannotBeLongerThan255Bytes()
    {
        var indexName = new string('a', 256);
        var exception = Assert.Throws<InvalidIndexNameException>(() =>
        {
            IndexNameHelper.Convert(indexName, base.ElasticsearchConfig);
        });

        Assert.Single(exception.Errors);
    }
}
Loading
Loading