Skip to content

Commit d1a503d

Browse files
authored
enhance: add file source and location metadata to knowledge results (#428)
1 parent df8e686 commit d1a503d

File tree

2 files changed

+73
-26
lines changed

2 files changed

+73
-26
lines changed

knowledge/pkg/datastore/retrieve.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -100,5 +100,13 @@ func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocum
100100
}
101101
}
102102
}
103-
return s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, where, whereDocument, ef)
103+
docs, err := s.Vectorstore.SimilaritySearch(ctx, query, numDocuments, datasetID, where, whereDocument, ef)
104+
if err != nil {
105+
return nil, err
106+
}
107+
for i, doc := range docs {
108+
doc.Metadata["datasetID"] = datasetID
109+
docs[i] = doc
110+
}
111+
return docs, nil
104112
}

result-formatter/main.go

+64-25
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"log/slog"
8+
neturl "net/url"
79
"os"
8-
"strconv"
910
"strings"
1011
"sync"
1112

@@ -22,14 +23,27 @@ type subqueryResults struct {
2223
}
2324

2425
type document struct {
25-
ID string `json:"id"`
26-
Content string `json:"content,omitempty"`
27-
Metadata map[string]any `json:"metadata,omitempty"`
26+
ID string `json:"id"`
27+
Content string `json:"content,omitempty"`
28+
Metadata metadata `json:"metadata,omitempty"`
29+
}
30+
31+
type metadata struct {
32+
Source string `json:"source,omitempty"`
33+
WorkspaceID string `json:"workspaceID,omitempty"`
34+
URL string `json:"url,omitempty"`
35+
Pages string `json:"pages,omitempty"`
36+
Page int `json:"page,omitempty"`
37+
TotalPages int `json:"totalPages,omitempty"`
38+
FileSize int `json:"fileSize,omitempty"`
39+
WorkspaceFileName string `json:"workspaceFileName,omitempty"` // workspaceFileName is the location of the converted file, not the original file - e.g. <path>/foo.pdf.json
40+
DatasetID string `json:"datasetID,omitempty"`
2841
}
2942

3043
type hit struct {
31-
URL string `json:"url,omitempty"`
32-
Content string `json:"content,omitempty"`
44+
URL string `json:"url,omitempty"` // URL should be the original source of the document (Web URL, OneDrive Link, etc.)
45+
Location string `json:"location,omitempty"` // Location should be the location of the result in the original source (page numbers, etc.)
46+
Content string `json:"content,omitempty"` // Content should be the text content of the document
3347
}
3448

3549
type inputContent struct {
@@ -44,22 +58,22 @@ func main() {
4458
ctx = context.Background()
4559
)
4660

47-
// This is ugly code, I know. Beauty comes later.
61+
// This is ugly code, I know. Beauty comes later. Cleaned up a little. Still room for improvement.
4862

4963
if clientErr != nil {
50-
_, _ = fmt.Fprintf(os.Stderr, "failed to create gptscript client: %v\n", clientErr)
64+
slog.Error("failed to create gptscript client", "error", clientErr)
5165
}
5266

5367
if err := json.Unmarshal([]byte(out), &output); err != nil {
54-
_, _ = fmt.Fprintf(os.Stderr, "failed to unmarshal output: %v\n", err)
68+
slog.Debug("failed to unmarshal output", "err", err)
5569
fmt.Print(out)
5670
return
5771
}
5872

5973
var (
6074
outDocs []hit
6175
wg sync.WaitGroup
62-
fullyFetched = map[string]struct{}{}
76+
fullyFetched = map[string]int{} // fullyFetched maps each workspace file for which a full fetch has been started (the fetch may still be running or may have failed) to the index of its entry in outDocs
6377
budget = 120_000
6478
)
6579

@@ -68,27 +82,51 @@ func main() {
6882
break
6983
}
7084
for _, doc := range result.ResultDocuments {
71-
filename, _ := doc.Metadata["workspaceFileName"].(string)
72-
if _, ok := fullyFetched[filename]; ok {
85+
filename := doc.Metadata.WorkspaceFileName
86+
87+
// We parse the location regardless of the file potentially being fully fetched already to preserve the
88+
// source reference metadata (i.e. where in the document the information was found).
89+
// This is a UX thing to help users with manual proofreading of answers.
90+
var location string
91+
if doc.Metadata.Pages != "" {
92+
location = "Pages " + doc.Metadata.Pages
93+
} else if doc.Metadata.Page > 0 {
94+
location = fmt.Sprintf("Page %d", doc.Metadata.Page)
95+
}
96+
if location != "" && doc.Metadata.TotalPages > 0 {
97+
location = fmt.Sprintf("%s of %d", location, doc.Metadata.TotalPages)
98+
slog.Debug("result doc in file", "filename", filename, "location", location)
99+
}
100+
101+
if ffi, ok := fullyFetched[filename]; ok {
102+
if location != "" {
103+
outDocs[ffi].Location += " and " + location
104+
}
73105
continue
74106
}
75107

76-
url, _ := doc.Metadata["url"].(string)
108+
var url string
109+
if doc.Metadata.URL != "" {
110+
url = doc.Metadata.URL
111+
} else if doc.Metadata.Source != "" {
112+
url = "knowledge://" + neturl.PathEscape(doc.Metadata.DatasetID+"::"+strings.TrimPrefix(doc.Metadata.Source, "ws://")) // <datasetID>::<source>, where source is Metadata.Source with any "ws://" prefix removed and datasetID is <namespace>/<knowledgeset>
113+
}
114+
77115
outDocs = append(outDocs, hit{
78-
URL: url,
79-
Content: doc.Content,
116+
URL: url,
117+
Content: doc.Content,
118+
Location: location,
80119
})
81120

82121
index := len(outDocs) - 1
83122

84123
if index < 3 && clientErr == nil {
85-
fileSize, _ := doc.Metadata["fileSize"].(string)
86-
size, _ := strconv.Atoi(fileSize)
87-
workspaceID, _ := doc.Metadata["workspaceID"].(string)
88-
if size > 5_000 && size < budget && workspaceID != "" {
89-
_, _ = fmt.Fprintf(os.Stderr, "reading file in workspace: %s\n", filename)
90-
fullyFetched[filename] = struct{}{}
91-
budget -= size
124+
fileSize := doc.Metadata.FileSize
125+
workspaceID := doc.Metadata.WorkspaceID
126+
if fileSize > 5_000 && fileSize < budget && workspaceID != "" {
127+
slog.Debug("fetching full file from workspace", "file", filename, "sizeInBytes", fileSize)
128+
fullyFetched[filename] = index
129+
budget -= fileSize
92130
wg.Add(1)
93131

94132
go func() {
@@ -98,13 +136,13 @@ func main() {
98136
WorkspaceID: workspaceID,
99137
})
100138
if err != nil {
101-
_, _ = fmt.Fprintf(os.Stderr, "failed to read file in workspace: %v\n", err)
139+
slog.Error("failed to read file in workspace", "error", err)
102140
return
103141
}
104142

105143
var sourceContent inputContent
106144
if err := json.Unmarshal(content, &sourceContent); err != nil {
107-
_, _ = fmt.Fprintf(os.Stderr, "failed to unmarshal content: %v\n", err)
145+
slog.Error("failed to unmarshal content", "error", err)
108146
return
109147
}
110148

@@ -115,10 +153,11 @@ func main() {
115153

116154
if buffer.Len() > 0 {
117155
outDocs[index].Content = buffer.String()
156+
outDocs[index].Location = "Full Document. Specifically " + outDocs[index].Location
118157
}
119158
}()
120159
} else {
121-
_, _ = fmt.Fprintf(os.Stderr, "file size is not within the range: %s %s %d %d\n", workspaceID, filename, size, budget)
160+
slog.Debug("filesize is not within range", "filename", fmt.Sprintf("%s/%s", workspaceID, filename), "filesize", fileSize, "budget", budget)
122161
}
123162
}
124163
}

0 commit comments

Comments
 (0)