Skip to content

Commit db41de8

Browse files
authored
fix: Fix exposing Qdrant api-key in metadata field when running to_dict (#1813)
* Add to_dict test
* Add more type hints
* More type hints
* Add fix for exposing api key in metadata when running to_dict
* Add unit test
* PR comments
1 parent 12d3415 commit db41de8

File tree

5 files changed

+119
-42
lines changed

5 files changed

+119
-42
lines changed

integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(
4646
score_threshold: Optional[float] = None,
4747
group_by: Optional[str] = None,
4848
group_size: Optional[int] = None,
49-
):
49+
) -> None:
5050
"""
5151
Create a QdrantEmbeddingRetriever component.
5252
@@ -136,7 +136,7 @@ def run(
136136
score_threshold: Optional[float] = None,
137137
group_by: Optional[str] = None,
138138
group_size: Optional[int] = None,
139-
):
139+
) -> Dict[str, List[Document]]:
140140
"""
141141
Run the Embedding Retriever on the given input data.
142142
@@ -180,7 +180,7 @@ async def run_async(
180180
score_threshold: Optional[float] = None,
181181
group_by: Optional[str] = None,
182182
group_size: Optional[int] = None,
183-
):
183+
) -> Dict[str, List[Document]]:
184184
"""
185185
Asynchronously run the Embedding Retriever on the given input data.
186186
@@ -252,7 +252,7 @@ def __init__(
252252
score_threshold: Optional[float] = None,
253253
group_by: Optional[str] = None,
254254
group_size: Optional[int] = None,
255-
):
255+
) -> None:
256256
"""
257257
Create a QdrantSparseEmbeddingRetriever component.
258258
@@ -342,7 +342,7 @@ def run(
342342
score_threshold: Optional[float] = None,
343343
group_by: Optional[str] = None,
344344
group_size: Optional[int] = None,
345-
):
345+
) -> Dict[str, List[Document]]:
346346
"""
347347
Run the Sparse Embedding Retriever on the given input data.
348348
@@ -391,7 +391,7 @@ async def run_async(
391391
score_threshold: Optional[float] = None,
392392
group_by: Optional[str] = None,
393393
group_size: Optional[int] = None,
394-
):
394+
) -> Dict[str, List[Document]]:
395395
"""
396396
Asynchronously run the Sparse Embedding Retriever on the given input data.
397397
@@ -473,7 +473,7 @@ def __init__(
473473
score_threshold: Optional[float] = None,
474474
group_by: Optional[str] = None,
475475
group_size: Optional[int] = None,
476-
):
476+
) -> None:
477477
"""
478478
Create a QdrantHybridRetriever component.
479479
@@ -557,7 +557,7 @@ def run(
557557
score_threshold: Optional[float] = None,
558558
group_by: Optional[str] = None,
559559
group_size: Optional[int] = None,
560-
):
560+
) -> Dict[str, List[Document]]:
561561
"""
562562
Run the Sparse Embedding Retriever on the given input data.
563563
@@ -606,7 +606,7 @@ async def run_async(
606606
score_threshold: Optional[float] = None,
607607
group_by: Optional[str] = None,
608608
group_size: Optional[int] = None,
609-
):
609+
) -> Dict[str, List[Document]]:
610610
"""
611611
Asynchronously run the Sparse Embedding Retriever on the given input data.
612612

integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import inspect
22
from itertools import islice
3-
from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Union
3+
from typing import Any, AsyncGenerator, ClassVar, Dict, Generator, List, Optional, Set, Tuple, Union
44

55
import numpy as np
66
import qdrant_client
@@ -18,6 +18,7 @@
1818
from .converters import (
1919
DENSE_VECTORS_NAME,
2020
SPARSE_VECTORS_NAME,
21+
QdrantPoint,
2122
convert_haystack_documents_to_qdrant_points,
2223
convert_id,
2324
convert_qdrant_point_to_haystack_document,
@@ -34,7 +35,7 @@ class QdrantStoreError(DocumentStoreError):
3435
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
3536

3637

37-
def get_batches_from_generator(iterable, n):
38+
def get_batches_from_generator(iterable: List, n: int) -> Generator:
3839
"""
3940
Batch elements of an iterable into fixed-length chunks or blocks.
4041
"""
@@ -127,10 +128,10 @@ def __init__(
127128
write_batch_size: int = 100,
128129
scroll_size: int = 10_000,
129130
payload_fields_to_index: Optional[List[dict]] = None,
130-
):
131+
) -> None:
131132
"""
132133
:param location:
133-
If `memory` - use in-memory Qdrant instance.
134+
If `":memory:"` - use in-memory Qdrant instance.
134135
If `str` - use it as a URL parameter.
135136
If `None` - use default values for host and port.
136137
:param url:
@@ -164,7 +165,7 @@ def __init__(
164165
Dimension of the embeddings.
165166
:param on_disk:
166167
Whether to store the collection on disk.
167-
:param use_sparse_embedding:
168+
:param use_sparse_embeddings:
168169
If set to `True`, enables support for sparse embeddings.
169170
:param sparse_idf:
170171
If set to `True`, computes the Inverse Document Frequency (IDF) when using sparse embeddings.
@@ -232,7 +233,6 @@ def __init__(
232233
self.path = path
233234
self.force_disable_check_same_thread = force_disable_check_same_thread
234235
self.metadata = metadata or {}
235-
self.api_key = api_key
236236

237237
# Store the Qdrant collection specific attributes
238238
self.shard_number = shard_number
@@ -258,9 +258,10 @@ def __init__(
258258
self.write_batch_size = write_batch_size
259259
self.scroll_size = scroll_size
260260

261-
def _initialize_client(self):
261+
def _initialize_client(self) -> None:
262262
if self._client is None:
263263
client_params = self._prepare_client_params()
264+
# This step adds the api-key and User-Agent to metadata
264265
self._client = qdrant_client.QdrantClient(**client_params)
265266
# Make sure the collection is properly set up
266267
self._set_up_collection(
@@ -274,7 +275,7 @@ def _initialize_client(self):
274275
self.payload_fields_to_index,
275276
)
276277

277-
async def _initialize_async_client(self):
278+
async def _initialize_async_client(self) -> None:
278279
"""
279280
Returns the asynchronous Qdrant client, initializing it if necessary.
280281
"""
@@ -628,8 +629,6 @@ def get_documents_by_id(
628629
629630
:param ids:
630631
A list of document IDs to retrieve.
631-
:param index:
632-
The name of the index to retrieve documents from.
633632
:returns:
634633
A list of documents.
635634
"""
@@ -661,8 +660,6 @@ async def get_documents_by_id_async(
661660
662661
:param ids:
663662
A list of document IDs to retrieve.
664-
:param index:
665-
The name of the index to retrieve documents from.
666663
:returns:
667664
A list of documents.
668665
"""
@@ -1210,7 +1207,7 @@ def get_distance(self, similarity: str) -> rest.Distance:
12101207
)
12111208
raise QdrantStoreError(msg) from ke
12121209

1213-
def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None):
1210+
def _create_payload_index(self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None) -> None:
12141211
"""
12151212
Create payload index for the collection if payload_fields_to_index is provided
12161213
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
@@ -1229,7 +1226,7 @@ def _create_payload_index(self, collection_name: str, payload_fields_to_index: O
12291226

12301227
async def _create_payload_index_async(
12311228
self, collection_name: str, payload_fields_to_index: Optional[List[dict]] = None
1232-
):
1229+
) -> None:
12331230
"""
12341231
Asynchronously create payload index for the collection if payload_fields_to_index is provided
12351232
See: https://qdrant.tech/documentation/concepts/indexing/#payload-index
@@ -1257,7 +1254,7 @@ def _set_up_collection(
12571254
sparse_idf: bool,
12581255
on_disk: bool = False,
12591256
payload_fields_to_index: Optional[List[dict]] = None,
1260-
):
1257+
) -> None:
12611258
"""
12621259
Sets up the Qdrant collection with the specified parameters.
12631260
:param collection_name:
@@ -1313,7 +1310,7 @@ async def _set_up_collection_async(
13131310
sparse_idf: bool,
13141311
on_disk: bool = False,
13151312
payload_fields_to_index: Optional[List[dict]] = None,
1316-
):
1313+
) -> None:
13171314
"""
13181315
Asynchronously sets up the Qdrant collection with the specified parameters.
13191316
:param collection_name:
@@ -1367,7 +1364,7 @@ def recreate_collection(
13671364
on_disk: Optional[bool] = None,
13681365
use_sparse_embeddings: Optional[bool] = None,
13691366
sparse_idf: bool = False,
1370-
):
1367+
) -> None:
13711368
"""
13721369
Recreates the Qdrant collection with the specified parameters.
13731370
@@ -1410,7 +1407,7 @@ async def recreate_collection_async(
14101407
on_disk: Optional[bool] = None,
14111408
use_sparse_embeddings: Optional[bool] = None,
14121409
sparse_idf: bool = False,
1413-
):
1410+
) -> None:
14141411
"""
14151412
Asynchronously recreates the Qdrant collection with the specified parameters.
14161413
@@ -1449,7 +1446,7 @@ def _handle_duplicate_documents(
14491446
self,
14501447
documents: List[Document],
14511448
policy: DuplicatePolicy = None,
1452-
):
1449+
) -> List[Document]:
14531450
"""
14541451
Checks whether any of the passed documents is already existing in the chosen index and returns a list of
14551452
documents that are not in the index yet.
@@ -1476,7 +1473,7 @@ async def _handle_duplicate_documents_async(
14761473
self,
14771474
documents: List[Document],
14781475
policy: DuplicatePolicy = None,
1479-
):
1476+
) -> List[Document]:
14801477
"""
14811478
Asynchronously checks whether any of the passed documents is already existing
14821479
in the chosen index and returns a list of
@@ -1521,7 +1518,7 @@ def _drop_duplicate_documents(self, documents: List[Document]) -> List[Document]
15211518

15221519
return _documents
15231520

1524-
def _prepare_collection_params(self):
1521+
def _prepare_collection_params(self) -> Dict[str, Any]:
15251522
"""
15261523
Prepares the common parameters for collection creation.
15271524
"""
@@ -1537,7 +1534,7 @@ def _prepare_collection_params(self):
15371534
"init_from": self.init_from,
15381535
}
15391536

1540-
def _prepare_client_params(self):
1537+
def _prepare_client_params(self) -> Dict[str, Any]:
15411538
"""
15421539
Prepares the common parameters for client initialization.
15431540
@@ -1554,7 +1551,10 @@ def _prepare_client_params(self):
15541551
"timeout": self.timeout,
15551552
"host": self.host,
15561553
"path": self.path,
1557-
"metadata": self.metadata,
1554+
# NOTE: We purposefully expand the fields of self.metadata to avoid modifying the original self.metadata
1555+
# class attribute. For example, the resolved api key is added to metadata by the QdrantClient class
1556+
# when using a hosted Qdrant service, which means running to_dict() exposes the api key.
1557+
"metadata": {**self.metadata},
15581558
"force_disable_check_same_thread": self.force_disable_check_same_thread,
15591559
}
15601560

@@ -1565,7 +1565,7 @@ def _prepare_collection_config(
15651565
on_disk: Optional[bool] = None,
15661566
use_sparse_embeddings: Optional[bool] = None,
15671567
sparse_idf: bool = False,
1568-
):
1568+
) -> Tuple[Dict[str, rest.VectorParams], Optional[Dict[str, rest.SparseVectorParams]]]:
15691569
"""
15701570
Prepares the configuration for creating or recreating a Qdrant collection.
15711571
@@ -1595,9 +1595,12 @@ def _prepare_collection_config(
15951595

15961596
return vectors_config, sparse_vectors_config
15971597

1598-
def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None):
1598+
def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]] = None) -> None:
15991599
"""
16001600
Validates the filters provided for querying.
1601+
1602+
:param filters: Filters to validate. Can be a dictionary or an instance of `qdrant_client.http.models.Filter`.
1603+
:raises ValueError: If the filters are not in the correct format or syntax.
16011604
"""
16021605
if filters and not isinstance(filters, dict) and not isinstance(filters, rest.Filter):
16031606
msg = "Filter must be a dictionary or an instance of `qdrant_client.http.models.Filter`"
@@ -1607,7 +1610,7 @@ def _validate_filters(self, filters: Optional[Union[Dict[str, Any], rest.Filter]
16071610
msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
16081611
raise ValueError(msg)
16091612

1610-
def _process_query_point_results(self, results, scale_score: bool = False):
1613+
def _process_query_point_results(self, results: List[QdrantPoint], scale_score: bool = False) -> List[Document]:
16111614
"""
16121615
Processes query results from Qdrant.
16131616
"""
@@ -1627,7 +1630,7 @@ def _process_query_point_results(self, results, scale_score: bool = False):
16271630

16281631
return documents
16291632

1630-
def _process_group_results(self, groups):
1633+
def _process_group_results(self, groups: List[rest.PointGroup]) -> List[Document]:
16311634
"""
16321635
Processes grouped query results from Qdrant.
16331636
@@ -1647,7 +1650,7 @@ def _validate_collection_compatibility(
16471650
collection_info,
16481651
distance,
16491652
embedding_dim: int,
1650-
):
1653+
) -> None:
16511654
"""
16521655
Validates that an existing collection is compatible with the current configuration.
16531656
"""

integrations/qdrant/src/haystack_integrations/document_stores/qdrant/filters.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,10 @@ def convert_filters_to_qdrant(
138138

139139

140140
def build_filters_for_repeated_operators(
141-
must_clauses,
142-
should_clauses,
143-
must_not_clauses,
144-
qdrant_filter,
141+
must_clauses: List,
142+
should_clauses: List,
143+
must_not_clauses: List,
144+
qdrant_filter: List[models.Filter],
145145
) -> List[models.Filter]:
146146
"""
147147
Flattens the nested lists of clauses by creating separate Filters for each clause of a logical operator.

integrations/qdrant/src/haystack_integrations/document_stores/qdrant/migrate_to_sparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
logger.setLevel(python_logging.INFO)
1212

1313

14-
def migrate_to_sparse_embeddings_support(old_document_store: QdrantDocumentStore, new_index: str):
14+
def migrate_to_sparse_embeddings_support(old_document_store: QdrantDocumentStore, new_index: str) -> None:
1515
"""
1616
Utility function to migrate an existing `QdrantDocumentStore` to a new one with support for sparse embeddings.
1717

0 commit comments

Comments
 (0)