wizeline
diff --git a/‎.gitignore
+2-1 b/‎.gitignore
+2-1
diff --git a/‎README.md
+139 b/‎README.md
+139
diff --git a/‎compose.yaml
+6 b/‎compose.yaml
+6
diff --git a/‎config.py
+12-4 b/‎config.py
+12-4
diff --git a/‎core/abstracts/services.py
+32-1 b/‎core/abstracts/services.py
+32-1
diff --git a/‎core/abstracts/usescases.py
+14 b/‎core/abstracts/usescases.py
+14
diff --git a/‎core/controller/vector.py
+16-4 b/‎core/controller/vector.py
+16-4
diff --git a/‎core/service/llama_index_service.py
+19-28 b/‎core/service/llama_index_service.py
+19-28
@@ -158,4 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-.envrc
+.envrc
+/.idea/*
@@ -71,6 +71,145 @@ flask run
 
 [⇧ back to top](#table-of-contents)
 
+### OpenSearch index
+An OpenSearch index is required to run this service. You can create the index with the following mapping:
+
+```
+// PUT /clone-vector-index 
+{
+    "aliases": {},
+    "mappings": {
+        "properties": {
+            "content": {
+                "type": "text",
+                "fields": {
+                    "keyword": {
+                        "type": "keyword",
+                        "ignore_above": 256
+                    }
+                }
+            },
+            "embedding": {
+                "type": "knn_vector",
+                "dimension": 384
+            },
+            "metadata": {
+                "properties": {
+                    "_node_content": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "_node_type": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "doc_id": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "document_id": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "file_uuid": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "processed_user": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "raw_text": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "ref_doc_id": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "source_name": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "twin_id": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    },
+                    "user_name": {
+                        "type": "text",
+                        "fields": {
+                            "keyword": {
+                                "type": "keyword",
+                                "ignore_above": 256
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    },
+    "settings": {
+        "index": {
+            "replication": {
+                "type": "DOCUMENT"
+            },
+            "number_of_shards": "1",
+            "number_of_replicas": "1"
+        }
+    }
+}
+```
+
 ## Building the Docker Image
 
 ```Bash
 
@@ -16,6 +16,12 @@ services:
     networks:
       - internal-net
     env_file: .env
+    depends_on:
+      localstack:
+        condition: service_started
+      opensearch:
+        condition: service_started
+        restart: true
 
   localstack:
     image: localstack/localstack:latest
 
@@ -17,7 +17,10 @@ class Config:
     LOG_LEVEL = environ.get("LOG_LEVEL")
     S3_BUCKET = environ.get("S3_BUCKET")
     OPENSEARCH_INDEX = environ.get("OPENSEARCH_INDEX")
-    OPENSEARCH_CLUSTER_URL = environ.get("OPENSEARCH_CLUSTER_URL")
+    OPENSEARCH_HOST = environ.get("OPENSEARCH_HOST")
+    OPENSEARCH_PORT = environ.get("OPENSEARCH_PORT")
+    OPENSEARCH_USE_SSL = environ.get("OPENSEARCH_USE_SSL")
+    OPENSEARCH_VERIFY_CERTS = environ.get("OPENSEARCH_VERIFY_CERTS")
     IS_LOCAL = environ.get("IS_LOCAL")
     S3_URL = None
     S3_INDEX_PATH = environ.get("S3_INDEX_PATH")
@@ -34,9 +37,14 @@ class DevelopmentConfig(Config):
     LOG_LEVEL = "DEBUG"
     OPENSEARCH_CLUSTER_URL = "http://host.docker.internal:9200"
     OPENSEARCH_INDEX = "clone-vector-index"
-    OPENSEARCH_USER = "clonAISearch"
-    OPENSEARCH_PASS = "user"
-    S3_BUCKET = "pass"
+    # OPENSEARCH_HOST = "localhost"
+    OPENSEARCH_HOST = "host.docker.internal"
+    OPENSEARCH_PORT = "9200"
+    OPENSEARCH_USER = ""
+    OPENSEARCH_PASS = ""
+    OPENSEARCH_USE_SSL = False
+    OPENSEARCH_VERIFY_CERTS = False
+    S3_BUCKET = "clone-ingestion-messages"
     IS_LOCAL = True
     S3_URL = "http://host.docker.internal:4566"
     AWS_ACCESS_KEY_ID = "test"
 
@@ -8,7 +8,7 @@ class AbstractS3Service(ABC):
     """
 
     @abstractmethod
-    def get_object(self, bucket_name: str, object_key: str) -> dict:
+    def get_object(self, bucket_name: str, object_key: str) -> list:
         """
         Abstract method to get an object from S3.
 
@@ -44,3 +44,34 @@ def vector_store_index(
             str: Index summary
         """
         pass
+
+    @abstractmethod
+    def vectorize_string(self, text_input: str) -> list:
+        """
+        Abstract method to convert a string into its embedding vector.
+
+        Args:
+            text_input (str): A string to vectorize
+
+        Returns:
+            list: a list of float values representing a vector
+        """
+        pass
+
+
+class AbstractOpensearchService(ABC):
+    """
+    Abstract class for Opensearch services
+    """
+    @abstractmethod
+    def search(self, query: dict) -> list:
+        """
+        Abstract method to query an opensearch index
+
+        Args:
+            query (dict): Opensearch query DSL body, as a dict
+
+        Returns:
+            list: a list of results
+        """
+        pass
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Any
 
 
 class AbstractVectorizeUsecase(ABC):
@@ -20,3 +21,16 @@ def vectorize_and_index(self, bucket_name: str, object_key: str) -> str:
             str: The indexed document.
         """
         pass
+
+    @abstractmethod
+    def search(self, query: str) -> list[dict[str, Any]]:
+        """
+        Abstract method to search for indexed documents.
+
+        Args:
+            query (str): The text to search for in the indexed documents.
+
+        Returns:
+            list[dict[str, Any]]: The list of results
+        """
+        pass
@@ -1,7 +1,8 @@
+from http import HTTPStatus
 from logging import Logger
 from typing import Any, Dict, Tuple
 
-from flask import jsonify
+from flask import jsonify, Response
 
 from core.abstracts.controller import AbstractVectorController
 from core.abstracts.usescases import AbstractVectorizeUsecase
@@ -22,7 +23,7 @@ def __init__(self, usecase: AbstractVectorizeUsecase, logger: Logger):
         self.usecase = usecase
         self.logger = logger
 
-    def vectoring(self, request: Dict[str, Any]) -> Tuple[Dict[str, str], int]:
+    def vectoring(self, request: Dict[str, Any]) -> Tuple[Response, int]:
         """
         Handle vectorization requests.
 
@@ -41,7 +42,18 @@ def vectoring(self, request: Dict[str, Any]) -> Tuple[Dict[str, str], int]:
 
         try:
             self.usecase.vectorize_and_index(s3_bucket, s3_object_key)
-            return jsonify({"message": "Object vectorization succeeded!"}), 200
+            return jsonify({"message": "Object vectorization succeeded!"}), HTTPStatus.OK
         except Exception as e:
             self.logger.error(f"Failed to vectorize object {s3_object_key}")
-            return jsonify({"error": str(e)}), 500
+            return jsonify({"error": str(e)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+    def search(self, request: Dict[str, Any]) -> Tuple[Response, int]:
+        """
+        Handle search requests.
+
+        Args:
+            request (Dict[str, Any]): Request parameters; the search text is read from key "q".
+
+        Returns:
+            Tuple[Response, int]: JSON response and HTTP status code. 400 when "q" is
+            missing, not a string or blank; 200 with the results on success; 500 when
+            the usecase raises.
+        """
+        # Use .get() rather than indexing: a missing "q" must yield the 400 below,
+        # not an unhandled KeyError.
+        query = request.get("q")
+        if not isinstance(query, str) or not query.strip():
+            return jsonify({'error': 'query param "q" is required'}), HTTPStatus.BAD_REQUEST
+
+        try:
+            result = self.usecase.search(query)
+            return jsonify({'results': result}), HTTPStatus.OK
+        except Exception as e:
+            # Log the failure, mirroring the error handling in vectoring().
+            self.logger.error(f"Failed to search documents: {e}")
+            return jsonify({'error': str(e)}), HTTPStatus.INTERNAL_SERVER_ERROR
@@ -2,10 +2,7 @@
 
 from llama_index.core import Document, StorageContext, VectorStoreIndex
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.vector_stores.opensearch import (
-    OpensearchVectorClient,
-    OpensearchVectorStore,
-)
+from llama_index.vector_stores.opensearch import OpensearchVectorStore
 
 from core.abstracts.services import AbstractLlamaIndexService
 from core.utils import utils
@@ -22,43 +19,25 @@ class LlamaIndexService(AbstractLlamaIndexService):
     """
 
     def __init__(
-        self,
-        open_search_url: str,
-        open_search_index: str,
-        open_search_user: str,
-        open_search_password: str,
-        logger: Logger,
+            self,
+            vector_store: OpensearchVectorStore,
+            logger: Logger,
     ):
         """
         Initialize the LlamaIndexService.
 
         Args:
-            open_search_url (str): URL of the OpenSearch instance.
-            open_search_index (str): Name of the index in OpenSearch where vectors are stored.
-            open_search_user (str): Username for OpenSearch.
-            open_search_password (str): Password for OpenSearch.
+            vector_store (OpensearchVectorStore): Elasticsearch/Opensearch vector store instance
             logger (Logger): Logger instance.
         """
         self.logger = logger
 
         self.logger.info("Initializing LlamaIndexService...")
-        self.client = OpensearchVectorClient(
-            endpoint=open_search_url,
-            index=open_search_index,
-            dim=384,
-            embedding_field=EMBEDDING_FIELD,
-            text_field=TEXT_FIELD,
-            http_auth=(open_search_user, open_search_password),
-        )
-
-        self.vector_store = OpensearchVectorStore(self.client)
-        self.storage_context = StorageContext.from_defaults(
-            vector_store=self.vector_store
-        )
+        self.storage_context = StorageContext.from_defaults(vector_store=vector_store)
         self.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 
     def vector_store_index(
-        self, twin_id: str, source_name: str, file_uuid: str, documents: list
+            self, twin_id: str, source_name: str, file_uuid: str, documents: list
     ) -> str:
         """
         Index documents and store vectors in OpenSearch.
@@ -109,3 +88,15 @@ def vector_store_index(
             message_error = f"Error while indexing documents for {twin_id}/{source_name}/{file_uuid}"
             self.logger.error(e)
             raise ValueError(message_error)
+
+    def vectorize_string(self, text_input: str) -> list:
+        """
+        Return the embedding (vector) of the text_input string.
+
+        Args:
+            text_input (str): Text to vectorize
+
+        Returns:
+            list: a list of float values representing the text_input vector
+        """
+        # Delegates to the embedding model created in __init__.
+        return self.embed_model.get_text_embedding(text_input)