Skip to content

Commit beb03de

Browse files
minglu7, pquentin, miguelgrinberg
authored
feat: extend default settings in AsyncVectorStore (elastic#2602)
* feat: extend default settings in AsyncVectorStore. This commit introduces a new parameter, custom_settings, to the AsyncVectorStore class. This allows users to provide their own settings that will extend the default settings. This increases the flexibility of the class and allows it to be tailored to specific use cases. The custom settings are applied in the _create_index_if_not_exists method.
* Update vectorstore.py
* Update vectorstore.py: apply changes in vectorstore
* Update vectorstore.py: format the py file
* Update test_vectorstore.py: add test_custom_index_settings in test_vectorstore
* Update test_vectorstore.py
* Update vectorstore.py: fix file format
* Update test_vectorstore.py: fix format
* Update vectorstore.py: add an error message in vectorstore when settings conflict
* Update vectorstore.py
* Update vectorstore.py: modify the comments of the param custom_index_settings
* Update vectorstore.py
* add settings conflict test
* reformat
---------
Co-authored-by: Quentin Pradet <[email protected]>
Co-authored-by: Miguel Grinberg <[email protected]>
1 parent 058abd3 commit beb03de

File tree

3 files changed

+119
-0
lines changed

3 files changed

+119
-0
lines changed

elasticsearch/helpers/vectorstore/_async/vectorstore.py

+17
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def __init__(
6060
vector_field: str = "vector_field",
6161
metadata_mappings: Optional[Dict[str, Any]] = None,
6262
user_agent: str = f"elasticsearch-py-vs/{lib_version}",
63+
custom_index_settings: Optional[Dict[str, Any]] = None,
6364
) -> None:
6465
"""
6566
:param user_header: user agent header specific to the 3rd party integration.
@@ -72,6 +73,11 @@ def __init__(
7273
the embedding vector goes in this field.
7374
:param client: Elasticsearch client connection. Alternatively specify the
7475
Elasticsearch connection with the other es_* parameters.
76+
:param custom_index_settings: A dictionary of custom settings for the index.
77+
This can include configurations like the number of shards, number of replicas,
78+
analysis settings, and other index-specific settings. If not provided, default
79+
settings will be used. Note that if the same setting is provided by both the user
80+
and the strategy, will raise an error.
7581
"""
7682
# Add integration-specific usage header for tracking usage in Elastic Cloud.
7783
# client.options preserves existing (non-user-agent) headers.
@@ -90,6 +96,7 @@ def __init__(
9096
self.text_field = text_field
9197
self.vector_field = vector_field
9298
self.metadata_mappings = metadata_mappings
99+
self.custom_index_settings = custom_index_settings
93100

94101
async def close(self) -> None:
95102
return await self.client.close()
@@ -306,6 +313,16 @@ async def _create_index_if_not_exists(self) -> None:
306313
vector_field=self.vector_field,
307314
num_dimensions=self.num_dimensions,
308315
)
316+
317+
if self.custom_index_settings:
318+
conflicting_keys = set(self.custom_index_settings.keys()) & set(
319+
settings.keys()
320+
)
321+
if conflicting_keys:
322+
raise ValueError(f"Conflicting settings: {conflicting_keys}")
323+
else:
324+
settings.update(self.custom_index_settings)
325+
309326
if self.metadata_mappings:
310327
metadata = mappings["properties"].get("metadata", {"properties": {}})
311328
for key in self.metadata_mappings.keys():

elasticsearch/helpers/vectorstore/_sync/vectorstore.py

+17
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def __init__(
5757
vector_field: str = "vector_field",
5858
metadata_mappings: Optional[Dict[str, Any]] = None,
5959
user_agent: str = f"elasticsearch-py-vs/{lib_version}",
60+
custom_index_settings: Optional[Dict[str, Any]] = None,
6061
) -> None:
6162
"""
6263
:param user_header: user agent header specific to the 3rd party integration.
@@ -69,6 +70,11 @@ def __init__(
6970
the embedding vector goes in this field.
7071
:param client: Elasticsearch client connection. Alternatively specify the
7172
Elasticsearch connection with the other es_* parameters.
73+
:param custom_index_settings: A dictionary of custom settings for the index.
74+
This can include configurations like the number of shards, number of replicas,
75+
analysis settings, and other index-specific settings. If not provided, default
76+
settings will be used. Note that if the same setting is provided by both the user
77+
and the strategy, will raise an error.
7278
"""
7379
# Add integration-specific usage header for tracking usage in Elastic Cloud.
7480
# client.options preserves existing (non-user-agent) headers.
@@ -87,6 +93,7 @@ def __init__(
8793
self.text_field = text_field
8894
self.vector_field = vector_field
8995
self.metadata_mappings = metadata_mappings
96+
self.custom_index_settings = custom_index_settings
9097

9198
def close(self) -> None:
9299
return self.client.close()
@@ -303,6 +310,16 @@ def _create_index_if_not_exists(self) -> None:
303310
vector_field=self.vector_field,
304311
num_dimensions=self.num_dimensions,
305312
)
313+
314+
if self.custom_index_settings:
315+
conflicting_keys = set(self.custom_index_settings.keys()) & set(
316+
settings.keys()
317+
)
318+
if conflicting_keys:
319+
raise ValueError(f"Conflicting settings: {conflicting_keys}")
320+
else:
321+
settings.update(self.custom_index_settings)
322+
306323
if self.metadata_mappings:
307324
metadata = mappings["properties"].get("metadata", {"properties": {}})
308325
for key in self.metadata_mappings.keys():

test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py

+85
Original file line numberDiff line numberDiff line change
@@ -907,3 +907,88 @@ def test_metadata_mapping(self, sync_client: Elasticsearch, index: str) -> None:
907907
assert "metadata" in mapping_properties
908908
for key, val in test_mappings.items():
909909
assert mapping_properties["metadata"]["properties"][key] == val
910+
911+
def test_custom_index_settings(
912+
self, sync_client: Elasticsearch, index: str
913+
) -> None:
914+
"""Test that the custom index settings are applied."""
915+
test_settings = {
916+
"analysis": {
917+
"tokenizer": {
918+
"custom_tokenizer": {"type": "pattern", "pattern": "[,;\\s]+"}
919+
},
920+
"analyzer": {
921+
"custom_analyzer": {
922+
"type": "custom",
923+
"tokenizer": "custom_tokenizer",
924+
}
925+
},
926+
}
927+
}
928+
929+
test_mappings = {
930+
"my_field": {"type": "keyword"},
931+
"another_field": {"type": "text", "analyzer": "custom_analyzer"},
932+
}
933+
934+
store = VectorStore(
935+
index=index,
936+
retrieval_strategy=DenseVectorStrategy(distance=DistanceMetric.COSINE),
937+
embedding_service=FakeEmbeddings(),
938+
num_dimensions=10,
939+
client=sync_client,
940+
metadata_mappings=test_mappings,
941+
custom_index_settings=test_settings,
942+
)
943+
944+
sample_texts = [
945+
"Sample text one, with some keywords.",
946+
"Another; sample, text with; different keywords.",
947+
"Third example text, with more keywords.",
948+
]
949+
store.add_texts(texts=sample_texts)
950+
951+
# Fetch the actual index settings from Elasticsearch
952+
actual_settings = sync_client.indices.get_settings(index=index)
953+
954+
# Assert that the custom settings were applied correctly
955+
custom_settings_applied = actual_settings[index]["settings"]["index"][
956+
"analysis"
957+
]
958+
assert (
959+
custom_settings_applied == test_settings["analysis"]
960+
), f"Expected custom index settings {test_settings} but got {custom_settings_applied}"
961+
962+
def test_custom_index_settings_with_collision(
963+
self, sync_client: Elasticsearch, index: str
964+
) -> None:
965+
"""Test that custom index settings that collide cause an error."""
966+
test_settings = {
967+
"default_pipeline": "my_pipeline",
968+
"analysis": {
969+
"tokenizer": {
970+
"custom_tokenizer": {"type": "pattern", "pattern": "[,;\\s]+"}
971+
},
972+
"analyzer": {
973+
"custom_analyzer": {
974+
"type": "custom",
975+
"tokenizer": "custom_tokenizer",
976+
}
977+
},
978+
},
979+
}
980+
981+
test_mappings = {
982+
"my_field": {"type": "keyword"},
983+
"another_field": {"type": "text", "analyzer": "custom_analyzer"},
984+
}
985+
986+
store = VectorStore(
987+
index=index,
988+
retrieval_strategy=SparseVectorStrategy(),
989+
client=sync_client,
990+
metadata_mappings=test_mappings,
991+
custom_index_settings=test_settings,
992+
)
993+
with pytest.raises(ValueError, match="Conflicting settings"):
994+
store.add_texts(texts=["some text"])

0 commit comments

Comments
 (0)