Skip to content

Commit a71e5b5

Browse files
makkomanJosemaPereiraHector Vela
authored
Feat: [Clon-22] Add search endpoint (#5)
* feat: linting code * feat: refactor * feat: add open search auth * chore: complement documentation * chore: fix default values * adds search functionality * Address self review comments --------- Co-authored-by: Josema Pereira Cih <[email protected]> Co-authored-by: Hector Vela <[email protected]>
1 parent 5008989 commit a71e5b5

13 files changed

+408
-57
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -158,4 +158,5 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161-
.envrc
161+
.envrc
162+
/.idea/*

README.md

+139
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,145 @@ flask run
7171

7272
[⇧ back to top](#table-of-contents)
7373

74+
### Opensearch index
75+
An opensearch index is required for running this service. You can create the index with the following mapping:
76+
77+
```
78+
// PUT /clone-vector-index
79+
{
80+
"aliases": {},
81+
"mappings": {
82+
"properties": {
83+
"content": {
84+
"type": "text",
85+
"fields": {
86+
"keyword": {
87+
"type": "keyword",
88+
"ignore_above": 256
89+
}
90+
}
91+
},
92+
"embedding": {
93+
"type": "knn_vector",
94+
"dimension": 384
95+
},
96+
"metadata": {
97+
"properties": {
98+
"_node_content": {
99+
"type": "text",
100+
"fields": {
101+
"keyword": {
102+
"type": "keyword",
103+
"ignore_above": 256
104+
}
105+
}
106+
},
107+
"_node_type": {
108+
"type": "text",
109+
"fields": {
110+
"keyword": {
111+
"type": "keyword",
112+
"ignore_above": 256
113+
}
114+
}
115+
},
116+
"doc_id": {
117+
"type": "text",
118+
"fields": {
119+
"keyword": {
120+
"type": "keyword",
121+
"ignore_above": 256
122+
}
123+
}
124+
},
125+
"document_id": {
126+
"type": "text",
127+
"fields": {
128+
"keyword": {
129+
"type": "keyword",
130+
"ignore_above": 256
131+
}
132+
}
133+
},
134+
"file_uuid": {
135+
"type": "text",
136+
"fields": {
137+
"keyword": {
138+
"type": "keyword",
139+
"ignore_above": 256
140+
}
141+
}
142+
},
143+
"processed_user": {
144+
"type": "text",
145+
"fields": {
146+
"keyword": {
147+
"type": "keyword",
148+
"ignore_above": 256
149+
}
150+
}
151+
},
152+
"raw_text": {
153+
"type": "text",
154+
"fields": {
155+
"keyword": {
156+
"type": "keyword",
157+
"ignore_above": 256
158+
}
159+
}
160+
},
161+
"ref_doc_id": {
162+
"type": "text",
163+
"fields": {
164+
"keyword": {
165+
"type": "keyword",
166+
"ignore_above": 256
167+
}
168+
}
169+
},
170+
"source_name": {
171+
"type": "text",
172+
"fields": {
173+
"keyword": {
174+
"type": "keyword",
175+
"ignore_above": 256
176+
}
177+
}
178+
},
179+
"twin_id": {
180+
"type": "text",
181+
"fields": {
182+
"keyword": {
183+
"type": "keyword",
184+
"ignore_above": 256
185+
}
186+
}
187+
},
188+
"user_name": {
189+
"type": "text",
190+
"fields": {
191+
"keyword": {
192+
"type": "keyword",
193+
"ignore_above": 256
194+
}
195+
}
196+
}
197+
}
198+
}
199+
}
200+
},
201+
"settings": {
202+
"index": {
203+
"replication": {
204+
"type": "DOCUMENT"
205+
},
206+
"number_of_shards": "1",
207+
"number_of_replicas": "1"
208+
}
209+
}
210+
}
211+
```
212+
74213
## Building the Docker Image
75214

76215
```Bash

compose.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ services:
1616
networks:
1717
- internal-net
1818
env_file: .env
19+
depends_on:
20+
localstack:
21+
condition: service_started
22+
opensearch:
23+
condition: service_started
24+
restart: true
1925

2026
localstack:
2127
image: localstack/localstack:latest

config.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@ class Config:
1717
LOG_LEVEL = environ.get("LOG_LEVEL")
1818
S3_BUCKET = environ.get("S3_BUCKET")
1919
OPENSEARCH_INDEX = environ.get("OPENSEARCH_INDEX")
20-
OPENSEARCH_CLUSTER_URL = environ.get("OPENSEARCH_CLUSTER_URL")
20+
OPENSEARCH_HOST = environ.get("OPENSEARCH_HOST")
21+
OPENSEARCH_PORT = environ.get("OPENSEARCH_PORT")
22+
OPENSEARCH_USE_SSL = environ.get("OPENSEARCH_USE_SSL")
23+
OPENSEARCH_VERIFY_CERTS = environ.get("OPENSEARCH_VERIFY_CERTS")
2124
IS_LOCAL = environ.get("IS_LOCAL")
2225
S3_URL = None
2326
S3_INDEX_PATH = environ.get("S3_INDEX_PATH")
@@ -34,9 +37,14 @@ class DevelopmentConfig(Config):
3437
LOG_LEVEL = "DEBUG"
3538
OPENSEARCH_CLUSTER_URL = "http://host.docker.internal:9200"
3639
OPENSEARCH_INDEX = "clone-vector-index"
37-
OPENSEARCH_USER = "clonAISearch"
38-
OPENSEARCH_PASS = "user"
39-
S3_BUCKET = "pass"
40+
# OPENSEARCH_HOST = "localhost"
41+
OPENSEARCH_HOST = "host.docker.internal"
42+
OPENSEARCH_PORT = "9200"
43+
OPENSEARCH_USER = ""
44+
OPENSEARCH_PASS = ""
45+
OPENSEARCH_USE_SSL = False
46+
OPENSEARCH_VERIFY_CERTS = False
47+
S3_BUCKET = "clone-ingestion-messages"
4048
IS_LOCAL = True
4149
S3_URL = "http://host.docker.internal:4566"
4250
AWS_ACCESS_KEY_ID = "test"

core/abstracts/services.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ class AbstractS3Service(ABC):
88
"""
99

1010
@abstractmethod
11-
def get_object(self, bucket_name: str, object_key: str) -> dict:
11+
def get_object(self, bucket_name: str, object_key: str) -> list:
1212
"""
1313
Abstract method to get an object from S3.
1414
@@ -44,3 +44,34 @@ def vector_store_index(
4444
str: Index summary
4545
"""
4646
pass
47+
48+
@abstractmethod
49+
def vectorize_string(self, text_input: str) -> list:
50+
"""
51+
Abstract method to indexing documents and store vectors in OpenSearch.
52+
53+
Args:
54+
text_input (str): A string to vectorize
55+
56+
Returns:
57+
list: a list of float values representing a vector
58+
"""
59+
pass
60+
61+
62+
class AbstractOpensearchService(ABC):
63+
"""
64+
Abstract class for Opensearch services
65+
"""
66+
@abstractmethod
67+
def search(self, query: dict) -> list:
68+
"""
69+
Abstract method to query an opensearch index
70+
71+
Args:
72+
query (dict): Opensearch DSL query string
73+
74+
Returns:
75+
list: a list of results
76+
"""
77+
pass

core/abstracts/usescases.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from abc import ABC, abstractmethod
2+
from typing import Any
23

34

45
class AbstractVectorizeUsecase(ABC):
@@ -20,3 +21,16 @@ def vectorize_and_index(self, bucket_name: str, object_key: str) -> str:
2021
str: The indexed document.
2122
"""
2223
pass
24+
25+
@abstractmethod
26+
def search(self, query: str) -> list[dict[str, Any]]:
27+
"""
28+
Abstract method to search for indexed documents.
29+
30+
Args:
31+
query (str): The text to search documents containing the query text.
32+
33+
Returns:
34+
list[dict[str, Any]]: The list of results
35+
"""
36+
pass

core/controller/vector.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
from http import HTTPStatus
12
from logging import Logger
23
from typing import Any, Dict, Tuple
34

4-
from flask import jsonify
5+
from flask import jsonify, Response
56

67
from core.abstracts.controller import AbstractVectorController
78
from core.abstracts.usescases import AbstractVectorizeUsecase
@@ -22,7 +23,7 @@ def __init__(self, usecase: AbstractVectorizeUsecase, logger: Logger):
2223
self.usecase = usecase
2324
self.logger = logger
2425

25-
def vectoring(self, request: Dict[str, Any]) -> Tuple[Dict[str, str], int]:
26+
def vectoring(self, request: Dict[str, Any]) -> Tuple[Response, int]:
2627
"""
2728
Handle vectorization requests.
2829
@@ -41,7 +42,18 @@ def vectoring(self, request: Dict[str, Any]) -> Tuple[Dict[str, str], int]:
4142

4243
try:
4344
self.usecase.vectorize_and_index(s3_bucket, s3_object_key)
44-
return jsonify({"message": "Object vectorization succeeded!"}), 200
45+
return jsonify({"message": "Object vectorization succeeded!"}), HTTPStatus.OK
4546
except Exception as e:
4647
self.logger.error(f"Failed to vectorize object {s3_object_key}")
47-
return jsonify({"error": str(e)}), 500
48+
return jsonify({"error": str(e)}), HTTPStatus.INTERNAL_SERVER_ERROR
49+
50+
def search(self, request: Dict[str, Any]) -> Tuple[Response, int]:
51+
query = request["q"]
52+
if query is None or query.strip() == "":
53+
return jsonify({'error': 'query param "q" is required'}), HTTPStatus.BAD_REQUEST
54+
55+
try:
56+
result = self.usecase.search(query)
57+
return jsonify({'results': result}), HTTPStatus.OK
58+
except Exception as e:
59+
return jsonify({'error': str(e)}), HTTPStatus.INTERNAL_SERVER_ERROR

core/service/llama_index_service.py

+19-28
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22

33
from llama_index.core import Document, StorageContext, VectorStoreIndex
44
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
5-
from llama_index.vector_stores.opensearch import (
6-
OpensearchVectorClient,
7-
OpensearchVectorStore,
8-
)
5+
from llama_index.vector_stores.opensearch import OpensearchVectorStore
96

107
from core.abstracts.services import AbstractLlamaIndexService
118
from core.utils import utils
@@ -22,43 +19,25 @@ class LlamaIndexService(AbstractLlamaIndexService):
2219
"""
2320

2421
def __init__(
25-
self,
26-
open_search_url: str,
27-
open_search_index: str,
28-
open_search_user: str,
29-
open_search_password: str,
30-
logger: Logger,
22+
self,
23+
vector_store: OpensearchVectorStore,
24+
logger: Logger,
3125
):
3226
"""
3327
Initialize the LlamaIndexService.
3428
3529
Args:
36-
open_search_url (str): URL of the OpenSearch instance.
37-
open_search_index (str): Name of the index in OpenSearch where vectors are stored.
38-
open_search_user (str): Username for OpenSearch.
39-
open_search_password (str): Password for OpenSearch.
30+
vector_store (OpensearchVectorStore): Elasticsearch/Opensearch vector store instance
4031
logger (Logger): Logger instance.
4132
"""
4233
self.logger = logger
4334

4435
self.logger.info("Initializing LlamaIndexService...")
45-
self.client = OpensearchVectorClient(
46-
endpoint=open_search_url,
47-
index=open_search_index,
48-
dim=384,
49-
embedding_field=EMBEDDING_FIELD,
50-
text_field=TEXT_FIELD,
51-
http_auth=(open_search_user, open_search_password),
52-
)
53-
54-
self.vector_store = OpensearchVectorStore(self.client)
55-
self.storage_context = StorageContext.from_defaults(
56-
vector_store=self.vector_store
57-
)
36+
self.storage_context = StorageContext.from_defaults(vector_store=vector_store)
5837
self.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
5938

6039
def vector_store_index(
61-
self, twin_id: str, source_name: str, file_uuid: str, documents: list
40+
self, twin_id: str, source_name: str, file_uuid: str, documents: list
6241
) -> str:
6342
"""
6443
Index documents and store vectors in OpenSearch.
@@ -109,3 +88,15 @@ def vector_store_index(
10988
message_error = f"Error while indexing documents for {twin_id}/{source_name}/{file_uuid}"
11089
self.logger.error(e)
11190
raise ValueError(message_error)
91+
92+
def vectorize_string(self, text_input: str) -> list:
93+
"""
94+
Retrieves the embedded value (vector) for the text_input string
95+
96+
Args:
97+
text_input (str): Text to vectorize
98+
99+
Returns:
100+
list: a list of float values representing the text_input vector
101+
"""
102+
return self.embed_model.get_text_embedding(text_input)

0 commit comments

Comments
 (0)