From 17d73e0b2c7249375d327c6be53f6cde1ca7616c Mon Sep 17 00:00:00 2001 From: Michael Keller Date: Tue, 28 May 2024 00:12:07 +0200 Subject: [PATCH] Update SearchClient to use webPageUrl instead of static fileName for webpages (fixes microsoft/kernel-memory#491) (#521) ## Motivation and Context (Why the change? What's the scenario?) When providing webpages as facts, the "filename" is currently always the static value "content.url", which provides no value when asking the LLM to include sources directly in the response (e.g. to have per-paragraph sources). Update SearchClient to use webPageUrl instead of static fileName for webpages. ## High level description (Approach, Design) When creating the facts, the web page URL is used in place of "content.url". Co-authored-by: Michael Keller --- service/Core/Search/SearchClient.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/service/Core/Search/SearchClient.cs b/service/Core/Search/SearchClient.cs index 88f746531..98a66fcb7 100644 --- a/service/Core/Search/SearchClient.cs +++ b/service/Core/Search/SearchClient.cs @@ -238,6 +238,8 @@ public async Task AskAsync( string fileName = memory.GetFileName(this._log); + string webPageUrl = memory.GetWebPageUrl(index); + var partitionText = memory.GetPartitionText(this._log).Trim(); if (string.IsNullOrEmpty(partitionText)) { @@ -248,7 +250,7 @@ public async Task AskAsync( factsAvailableCount++; // TODO: add file age in days, to push relevance of newer documents - var fact = $"==== [File:{fileName};Relevance:{relevance:P1}]:\n{partitionText}\n"; + var fact = $"==== [File:{(fileName == "content.url" ? webPageUrl : fileName)};Relevance:{relevance:P1}]:\n{partitionText}\n"; // Use the partition/chunk only if there's room for it var size = this._textGenerator.CountTokens(fact);