-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathvectorstore.go
116 lines (95 loc) · 2.75 KB
/
vectorstore.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
package gptbot
import (
"context"
"encoding/json"
"os"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
"gonum.org/v1/gonum/mat"
)
// LocalVectorStore keeps chunks in a Go map held in process memory.
//
// The methods in this file do no locking around the map.
type LocalVectorStore struct {
// chunks maps a document ID to the chunks inserted under that ID.
chunks map[string][]*Chunk
}
// NewLocalVectorStore returns an empty LocalVectorStore whose internal map is
// already allocated, so it is ready for Insert.
func NewLocalVectorStore() *LocalVectorStore {
	store := new(LocalVectorStore)
	store.chunks = map[string][]*Chunk{}
	return store
}
// LoadJSON reads the file at filename, decodes it as a JSON array of chunks,
// groups the chunks by their DocumentID and hands them to Insert.
//
// JSON null elements in the array are skipped: they decode to nil *Chunk
// values, and reading DocumentID from one would panic.
//
// It returns any error from reading the file, from decoding the JSON, or from
// Insert.
func (vs *LocalVectorStore) LoadJSON(ctx context.Context, filename string) error {
	data, err := os.ReadFile(filename)
	if err != nil {
		return err
	}

	var chunks []*Chunk
	if err := json.Unmarshal(data, &chunks); err != nil {
		return err
	}

	chunkMap := make(map[string][]*Chunk)
	for _, chunk := range chunks {
		if chunk == nil {
			// A `null` array element; there is nothing to store for it.
			continue
		}
		chunkMap[chunk.DocumentID] = append(chunkMap[chunk.DocumentID], chunk)
	}
	return vs.Insert(ctx, chunkMap)
}
// StoreJSON serializes every chunk in the store as a single JSON array and
// writes it to filename with mode 0666.
//
// Chunks are written grouped by document ID, with the IDs in ascending order
// and each document's chunks in their stored order. Go map iteration order is
// random, so sorting the IDs is what makes the output identical across runs
// for the same data. An empty store is written as `[]`.
//
// It returns any error from encoding the chunks or from writing the file.
func (vs *LocalVectorStore) StoreJSON(filename string) error {
	documentIDs := make([]string, 0, len(vs.chunks))
	total := 0
	for documentID, chunkList := range vs.chunks {
		documentIDs = append(documentIDs, documentID)
		total += len(chunkList)
	}
	slices.Sort(documentIDs)

	// A non-nil slice makes an empty store encode as `[]` rather than `null`.
	chunks := make([]*Chunk, 0, total)
	for _, documentID := range documentIDs {
		chunks = append(chunks, vs.chunks[documentID]...)
	}

	b, err := json.Marshal(chunks)
	if err != nil {
		return err
	}
	return os.WriteFile(filename, b, 0666)
}
// GetAllData exposes the store's internal document-ID-to-chunks map.
// The map is returned as is, not copied. It exists mainly to support tests.
func (vs *LocalVectorStore) GetAllData(ctx context.Context) map[string][]*Chunk {
	all := vs.chunks
	return all
}
// Insert adds the given chunks to the store. chunks maps a document ID to the
// chunks to add under that ID; they are appended after any chunks already
// stored for the same ID.
//
// Insert also works on a zero-value LocalVectorStore: the internal map is
// allocated on first use instead of panicking on a nil-map write.
// It always returns nil.
func (vs *LocalVectorStore) Insert(ctx context.Context, chunks map[string][]*Chunk) error {
	if vs.chunks == nil {
		vs.chunks = make(map[string][]*Chunk, len(chunks))
	}
	for documentID, chunkList := range chunks {
		vs.chunks[documentID] = append(vs.chunks[documentID], chunkList...)
	}
	return nil
}
// queryError is the error type Query returns when an embedding cannot be scored.
type queryError string

// Error implements the error interface.
func (e queryError) Error() string { return string(e) }

// Query scores every stored chunk against embedding using the dot product and
// returns at most topK results, highest score first. Chunks with equal scores
// keep the order in which they were visited.
//
// corpusID is not used by this implementation; chunks of all documents are
// searched. A topK of zero or less yields (nil, nil).
//
// Query returns an error, rather than letting the vector routines panic, when
// embedding is empty or when a stored chunk's embedding has a different length
// than embedding. Nil chunks are skipped.
func (vs *LocalVectorStore) Query(ctx context.Context, embedding Embedding, corpusID string, topK int) ([]*Similarity, error) {
	if topK <= 0 {
		return nil, nil
	}
	if len(embedding) == 0 {
		// mat.NewVecDense panics on a zero-length vector.
		return nil, queryError("gptbot: query embedding is empty")
	}

	target := mat.NewVecDense(len(embedding), embedding)

	var similarities []*Similarity
	for _, chunks := range vs.chunks {
		for _, chunk := range chunks {
			if chunk == nil {
				continue
			}
			if len(chunk.Embedding) != len(embedding) {
				// mat.Dot panics when the two vectors differ in length.
				return nil, queryError("gptbot: chunk embedding dimension does not match query embedding")
			}

			candidate := mat.NewVecDense(len(chunk.Embedding), chunk.Embedding)
			score := mat.Dot(target, candidate)
			similarities = append(similarities, &Similarity{
				Chunk: chunk,
				Score: score,
			})
		}
	}

	// Sort similarities by score in descending order.
	slices.SortStableFunc(similarities, func(a, b *Similarity) bool {
		return a.Score > b.Score
	})

	if len(similarities) <= topK {
		return similarities, nil
	}
	return similarities[:topK], nil
}
// Delete removes all chunks stored under each of the given documentIDs.
// Calling it with no documentIDs is a special case that empties the whole store.
// It always returns nil.
func (vs *LocalVectorStore) Delete(ctx context.Context, documentIDs ...string) error {
	// No IDs given: wipe everything and stop.
	if len(documentIDs) == 0 {
		maps.Clear(vs.chunks)
		return nil
	}

	for i := range documentIDs {
		delete(vs.chunks, documentIDs[i])
	}
	return nil
}