Skip to content

Commit bb1630b

Browse files
authored
Add PGVector Support (#94)
* feat: add pgvector * feat: delete summary index too
1 parent ea5e041 commit bb1630b

File tree

7 files changed

+241
-2
lines changed

7 files changed

+241
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,5 +166,5 @@ Super-Rag comes with a built in REST API powered by FastApi.
166166
- Qdrant
167167
- Weaviate
168168
- Astra
169-
- Supabase (coming soon)
169+
- PGVector
170170
- Chroma (coming soon)

api/delete.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from models.delete import RequestPayload, ResponsePayload
44
from vectordbs import get_vector_service
55
from vectordbs.base import BaseVectorDatabase
6+
from utils.summarise import SUMMARY_SUFFIX
67

78
router = APIRouter()
89

@@ -16,8 +17,15 @@ async def delete(payload: RequestPayload):
1617
encoder=encoder,
1718
dimensions=payload.encoder.dimensions,
1819
)
20+
summary_vector_service: BaseVectorDatabase = get_vector_service(
21+
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
22+
credentials=payload.vector_database,
23+
encoder=encoder,
24+
dimensions=payload.encoder.dimensions,
25+
)
1926

2027
for file in payload.files:
2128
data = await vector_service.delete(file_url=file.url)
29+
await summary_vector_service.delete(file_url=file.url)
2230

2331
return ResponsePayload(success=True, data=data)

models/vector_database.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class DatabaseType(Enum):
99
pinecone = "pinecone"
1010
weaviate = "weaviate"
1111
astra = "astra"
12+
pgvector = "pgvector"
1213

1314

1415
class VectorDatabase(BaseModel):

poetry.lock

Lines changed: 133 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ gunicorn = "^21.2.0"
3333
unstructured-client = "^0.18.0"
3434
unstructured = {extras = ["google-drive"], version = "^0.12.4"}
3535
tiktoken = "^0.6.0"
36+
vecs = "^0.4.3"
3637

3738
[tool.poetry.group.dev.dependencies]
3839
termcolor = "^2.4.0"

vectordbs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from vectordbs.pinecone import PineconeService
1111
from vectordbs.qdrant import QdrantService
1212
from vectordbs.weaviate import WeaviateService
13+
from vectordbs.pgvector import PGVectorService
1314

1415
load_dotenv()
1516

@@ -26,6 +27,7 @@ def get_vector_service(
2627
"qdrant": QdrantService,
2728
"weaviate": WeaviateService,
2829
"astra": AstraService,
30+
"pgvector": PGVectorService,
2931
# Add other providers here
3032
# e.g "weaviate": WeaviateVectorService,
3133
}

vectordbs/pgvector.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from typing import List
2+
3+
import vecs
4+
from semantic_router.encoders import BaseEncoder
5+
from tqdm import tqdm
6+
7+
from qdrant_client.http import models as rest
8+
from models.delete import DeleteResponse
9+
from models.document import BaseDocumentChunk
10+
from vectordbs.base import BaseVectorDatabase
11+
12+
# Default number of results requested by PGVectorService.query when the
# caller does not pass an explicit top_k.
MAX_QUERY_TOP_K = 5
13+
14+
15+
class PGVectorService(BaseVectorDatabase):
    """Vector database backend that stores chunks in a `vecs` collection.

    The collection is named after ``index_name`` and is fetched or created
    when the service is constructed.
    """

    def __init__(
        self, index_name: str, dimension: int, credentials: dict, encoder: BaseEncoder
    ):
        """Connect to the database and get or create the collection.

        Args:
            index_name: Name of the collection to use.
            dimension: Dimension the collection is created with.
            credentials: Mapping that must contain a ``"database_uri"`` key,
                which is passed to ``vecs`` as the connection string.
            encoder: Encoder forwarded to the base class.

        Raises:
            KeyError: If ``credentials`` has no ``"database_uri"`` entry.
        """
        super().__init__(
            index_name=index_name,
            dimension=dimension,
            credentials=credentials,
            encoder=encoder,
        )
        client = vecs.create_client(connection_string=credentials["database_uri"])
        self.collection = client.get_or_create_collection(
            name=self.index_name,
            dimension=dimension,
        )

    # TODO: remove this
    async def convert_to_rerank_format(self, chunks: List[rest.PointStruct]):
        """Reduce each chunk's payload to the fields used for reranking.

        Returns a list of dicts with the keys ``content``, ``page_label`` and
        ``file_url``; a key missing from a payload yields ``None``.
        """
        # NOTE(review): the parameter is typed with a Qdrant model even though
        # this is the PGVector backend - presumably copied from the Qdrant
        # service; confirm before removing (see TODO above).
        return [
            {
                "content": chunk.payload.get("content"),
                "page_label": chunk.payload.get("page_label"),
                "file_url": chunk.payload.get("file_url"),
            }
            for chunk in chunks
        ]

    async def upsert(self, chunks: List[BaseDocumentChunk]) -> None:
        """Write ``chunks`` to the collection and (re)create its index.

        Each record is an ``(id, dense_embedding, metadata)`` tuple. The
        metadata holds ``document_id``, ``content`` and ``doc_url``, followed
        by the chunk's own metadata, which takes precedence on key clashes.
        """
        records = [
            (
                chunk.id,
                chunk.dense_embedding,
                {
                    "document_id": chunk.document_id,
                    "content": chunk.content,
                    "doc_url": chunk.doc_url,
                    **(chunk.metadata or {}),
                },
            )
            for chunk in tqdm(chunks, desc="Upserting to PGVector")
        ]
        self.collection.upsert(records)
        # NOTE(review): runs after every upsert; presumably this rebuilds an
        # existing index each time - confirm the cost against the vecs docs.
        self.collection.create_index()

    async def query(self, input: str, top_k: int = MAX_QUERY_TOP_K) -> List:
        """Return up to ``top_k`` chunks matching ``input``.

        The input is embedded with ``_generate_vectors`` and the first
        resulting vector is used to query the collection. Each hit is turned
        into a ``BaseDocumentChunk`` built from the stored metadata.
        """
        vectors = await self._generate_vectors(input=input)

        results = self.collection.query(
            data=vectors[0],
            limit=top_k,
            include_metadata=True,
            include_value=False,
        )

        chunks = []
        # Each result is unpacked as an (id, metadata) pair.
        for chunk_id, metadata in results:
            chunks.append(
                BaseDocumentChunk(
                    id=chunk_id,
                    source_type=metadata.get("filetype"),
                    source=metadata.get("doc_url"),
                    document_id=metadata.get("document_id"),
                    content=metadata.get("content"),
                    doc_url=metadata.get("doc_url"),
                    page_number=metadata.get("page_number"),
                    metadata={**metadata},
                )
            )
        return chunks

    async def delete(self, file_url: str) -> DeleteResponse:
        """Delete every record whose ``doc_url`` metadata equals ``file_url``.

        Returns:
            A ``DeleteResponse`` whose ``num_of_deleted_chunks`` is the length
            of the value returned by the collection's ``delete`` call.
        """
        deleted = self.collection.delete(filters={"doc_url": {"$eq": file_url}})
        return DeleteResponse(num_of_deleted_chunks=len(deleted))

0 commit comments

Comments
 (0)