Skip to content

Commit

Permalink
String normalization extension
Browse files Browse the repository at this point in the history
  • Loading branch information
dluc committed Feb 7, 2025
1 parent 353b662 commit 13613e3
Show file tree
Hide file tree
Showing 10 changed files with 177 additions and 20 deletions.
6 changes: 2 additions & 4 deletions extensions/Chunkers/Chunkers/MarkDownChunker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.Text;

namespace Microsoft.KernelMemory.Chunkers;

Expand Down Expand Up @@ -152,10 +153,7 @@ public List<string> Split(string text, MarkDownChunkerOptions options)
ArgumentNullException.ThrowIfNull(options);

// Clean up text. Note: LLMs don't use \r char
text = text
.Replace("\r\n", "\n", StringComparison.OrdinalIgnoreCase)
.Replace("\r", "\n", StringComparison.OrdinalIgnoreCase)
.Trim();
text = text.NormalizeNewlines(true);

// Calculate chunk size leaving room for the optional chunk header
int maxChunk1Size = options.MaxTokensPerChunk - this.TokenCount(options.ChunkHeader);
Expand Down
6 changes: 2 additions & 4 deletions extensions/Chunkers/Chunkers/PlainTextChunker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.Text;

namespace Microsoft.KernelMemory.Chunkers;

Expand Down Expand Up @@ -127,10 +128,7 @@ public List<string> Split(string text, PlainTextChunkerOptions options)
ArgumentNullException.ThrowIfNull(options);

// Clean up text. Note: LLMs don't use \r char
text = text
.Replace("\r\n", "\n", StringComparison.OrdinalIgnoreCase)
.Replace("\r", "\n", StringComparison.OrdinalIgnoreCase)
.Trim();
text = text.NormalizeNewlines(true);

// Calculate chunk size leaving room for the optional chunk header
int maxChunk1Size = options.MaxTokensPerChunk - this.TokenCount(options.ChunkHeader);
Expand Down
76 changes: 76 additions & 0 deletions service/Abstractions/Text/StringExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.KernelMemory.Text;

/// <summary>
/// String helpers for normalizing text.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Replaces Windows (<c>\r\n</c>) and old Mac (<c>\r</c>) line endings with a single
    /// <c>\n</c>, optionally removing leading and trailing whitespace in the same pass.
    /// </summary>
    /// <param name="text">Text to normalize. Null and empty values are returned unchanged.</param>
    /// <param name="trim">
    /// When true, leading and trailing whitespace (as defined by <see cref="char.IsWhiteSpace(char)"/>,
    /// which includes line endings) is removed from the result.
    /// </param>
    /// <returns>
    /// The normalized text. Null only when <paramref name="text"/> is null.
    /// </returns>
    [return: System.Diagnostics.CodeAnalysis.NotNullIfNotNull("text")]
    public static string? NormalizeNewlines(this string? text, bool trim = false)
    {
        // Null and empty input have nothing to normalize: hand them back as-is
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        // The output can only shrink (\r\n collapses to \n), so the original length is enough
        char[] buffer = new char[text.Length];
        int bufferPos = 0;

        // Skip leading whitespace if trimming
        int i = 0;
        if (trim)
        {
            while (i < text.Length && char.IsWhiteSpace(text[i]))
            {
                i++;
            }
        }

        // Index in buffer of the last char to keep.
        // Stays -1 when nothing is written, e.g. trimming a whitespace-only string.
        int lastNonWhitespacePos = -1;

        // Single pass: replace \r\n or \r with \n, and record the last char to keep
        for (; i < text.Length; i++)
        {
            char c = text[i];

            if (c == '\r')
            {
                // If \r\n then skip the \n, so the pair produces a single \n
                if (i + 1 < text.Length && text[i + 1] == '\n')
                {
                    i++;
                }

                buffer[bufferPos] = '\n';
            }
            else
            {
                buffer[bufferPos] = c;
            }

            // If trimming, only non-whitespace chars move the end marker forward, so trailing
            // whitespace is left out. If not trimming, every char is kept.
            if (!trim || !char.IsWhiteSpace(buffer[bufferPos]))
            {
                lastNonWhitespacePos = bufferPos;
            }

            bufferPos++;
        }

        // Without trimming the marker always points at the last written char, so in both modes
        // the result length is marker + 1 (0 when nothing was written).
        return new string(buffer, 0, lastNonWhitespacePos + 1);
    }
}
15 changes: 10 additions & 5 deletions service/Core/DataFormats/Image/ImageDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Pipeline;
using Microsoft.KernelMemory.Text;

namespace Microsoft.KernelMemory.DataFormats.Image;

Expand Down Expand Up @@ -64,7 +65,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel

var result = new FileContent(MimeTypes.PlainText);
var content = await this.ImageToTextAsync(data, cancellationToken).ConfigureAwait(false);
result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(content, 1, Chunk.Meta(sentencesAreComplete: true)));

return result;
}
Expand All @@ -87,10 +88,14 @@ private async Task<string> ImageToTextAsync(BinaryData data, CancellationToken c
}
}

private Task<string> ImageToTextAsync(Stream data, CancellationToken cancellationToken = default)
private async Task<string> ImageToTextAsync(Stream data, CancellationToken cancellationToken = default)
{
return this._ocrEngine is null
? throw new NotSupportedException($"Image extraction not configured")
: this._ocrEngine.ExtractTextFromImageAsync(data, cancellationToken);
if (this._ocrEngine is null)
{
throw new NotSupportedException($"Image extraction not configured");
}

string text = await this._ocrEngine.ExtractTextFromImageAsync(data, cancellationToken).ConfigureAwait(false);
return text.NormalizeNewlines(true);
}
}
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Office/MsExcelDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
sb.AppendLineNix(this._config.EndOfWorksheetMarkerTemplate.Replace("{number}", $"{worksheetNumber}", StringComparison.OrdinalIgnoreCase));
}

string worksheetContent = sb.ToString().Trim();
string worksheetContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
}
Expand Down
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Office/MsPowerPointDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
}
}

string slideContent = sb.ToString().Trim();
string slideContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
}
Expand Down
6 changes: 4 additions & 2 deletions service/Core/DataFormats/Office/MsWordDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
var lastRenderedPageBreak = p.GetFirstChild<Run>()?.GetFirstChild<LastRenderedPageBreak>();
if (lastRenderedPageBreak != null)
{
string pageContent = sb.ToString().Trim();
// Note: no trimming, use original spacing when working with pages
string pageContent = sb.ToString().NormalizeNewlines(false);
sb.Clear();
result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
pageNumber++;
Expand All @@ -90,7 +91,8 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
}
}

var lastPageContent = sb.ToString().Trim();
// Note: no trimming, use original spacing when working with pages
string lastPageContent = sb.ToString().NormalizeNewlines(false);
result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));

return Task.FromResult(result);
Expand Down
6 changes: 4 additions & 2 deletions service/Core/DataFormats/Pdf/PdfDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Pipeline;
using Microsoft.KernelMemory.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
Expand Down Expand Up @@ -56,8 +57,9 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
// Note: no trimming, use original spacing
string pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
// Note: no trimming, use original spacing when working with pages
string pageContent = ContentOrderTextExtractor.GetText(page).NormalizeNewlines(false) ?? string.Empty;

result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

Expand Down
3 changes: 2 additions & 1 deletion service/Core/DataFormats/WebPages/HtmlDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Pipeline;
using Microsoft.KernelMemory.Text;

namespace Microsoft.KernelMemory.DataFormats.WebPages;

Expand Down Expand Up @@ -51,7 +52,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
var doc = new HtmlDocument();
doc.Load(data);

result.Sections.Add(new Chunk(doc.DocumentNode.InnerText.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(doc.DocumentNode.InnerText.NormalizeNewlines(true), 1, Chunk.Meta(sentencesAreComplete: true)));

return Task.FromResult(result);
}
Expand Down
75 changes: 75 additions & 0 deletions service/tests/Abstractions.UnitTests/Text/StringExtensionsTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.Text;

namespace Microsoft.KM.Abstractions.UnitTests.Text;

/// <summary>
/// Unit tests for <see cref="StringExtensions.NormalizeNewlines"/>, with and without trimming.
/// </summary>
public class StringExtensionsTest
{
    /// <summary>
    /// Without trimming, \r\n and \r become \n and every other character is preserved.
    /// </summary>
    [Theory]
    [Trait("Category", "UnitTest")]
    [InlineData(null, null)]
    [InlineData("", "")]
    [InlineData(" ", " ")]
    [InlineData("\n", "\n")]
    [InlineData("\r", "\n")] // Old Mac
    [InlineData("\r\n", "\n")] // Windows
    [InlineData("\n\r", "\n\n")] // Not standard, that's 2 line endings
    [InlineData("\n\n\n", "\n\n\n")]
    [InlineData("\r\r\r", "\n\n\n")]
    [InlineData("\r\r\n\r", "\n\n\n")]
    [InlineData("\n\r\n\r", "\n\n\n")]
    [InlineData("ciao", "ciao")]
    [InlineData("ciao ", "ciao ")]
    [InlineData(" ciao ", " ciao ")]
    [InlineData("\r ciao ", "\n ciao ")]
    [InlineData(" \rciao ", " \nciao ")]
    [InlineData(" \r\nciao ", " \nciao ")]
    [InlineData(" \r\nciao\n ", " \nciao\n ")]
    [InlineData(" \r\nciao \n", " \nciao \n")]
    [InlineData(" \r\nciao \r", " \nciao \n")]
    [InlineData(" \r\nciao \rn", " \nciao \nn")]
    public void ItNormalizesLineEndings(string? input, string? expected)
    {
        // Act
        // Note: null is a deliberate test case and is expected to come back as null, so the
        // nullable warning is suppressed on the argument and the result is declared nullable.
        string? actual = input!.NormalizeNewlines();

        // Assert
        Assert.Equal(expected, actual);
    }

    /// <summary>
    /// With trimming, line endings are normalized and leading/trailing whitespace is removed.
    /// </summary>
    [Theory]
    [Trait("Category", "UnitTest")]
    [InlineData(null, null)]
    [InlineData("", "")]
    [InlineData(" ", "")]
    [InlineData("\n", "")]
    [InlineData("\r", "")]
    [InlineData("\r\n", "")]
    [InlineData("\n\r", "")]
    [InlineData("\n\n\n", "")]
    [InlineData("\r\r\r", "")]
    [InlineData("\r\r\n\r", "")]
    [InlineData("\n\r\n\r", "")]
    [InlineData("ciao", "ciao")]
    [InlineData("ciao ", "ciao")]
    [InlineData(" ciao ", "ciao")]
    [InlineData("\r ciao ", "ciao")]
    [InlineData(" \rciao ", "ciao")]
    [InlineData(" \r\nciao ", "ciao")]
    [InlineData(" \r\nciao\n ", "ciao")]
    [InlineData(" \r\nciao \n", "ciao")]
    [InlineData(" \r\nciao \r", "ciao")]
    [InlineData(" \r\nciao \rn", "ciao \nn")]
    [InlineData(" \r\nc\ri\ra\no \r", "c\ni\na\no")]
    [InlineData(" \r\nc\r\ni\n\na\r\ro \r", "c\ni\n\na\n\no")]
    [InlineData(" \r\nccc\r\ni\n\naaa\r\ro \r", "ccc\ni\n\naaa\n\no")]
    public void ItCanTrimWhileNormalizingLineEndings(string? input, string? expected)
    {
        // Act
        // Note: null is a deliberate test case and is expected to come back as null, so the
        // nullable warning is suppressed on the argument and the result is declared nullable.
        string? actual = input!.NormalizeNewlines(true);

        // Assert
        Assert.Equal(expected, actual);
    }
}

0 comments on commit 13613e3

Please sign in to comment.